Required Variables and Datasets for Research Question 01
Code
dataset_names <- c("AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp",
"AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bbi",
"AHSnhs11bbi", "AHSnhs11bbi", "AHSnhs11bbi", "AHSnhs11bbi",
"AHSnhs11bbi", "AHSnhs11bmd", "AHSnhs11bmd", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp")
variable_names <- c("COBCODBC", "YOABC", "AGEB", "BMISC", "SYSTOL", "DIASTOL",
"SEX", "CHOLRESB", "HDLCHREB", "LDLRESB", "CHOLNTR",
"HDLCHSEX", "DIABPRVE", "ATCCURFB", "INSFLAG", "CARDASP", "DIABMQ01", "DIETQ8", "DIETQ5", "DIETQ12", "DIETQ14", "SALTACI", "SALTATI", "MILKFATU", "DIETRDI", "DIETQ1")
variable_descriptions <- c("Country of birth", "Year of arrival in Australia", "Age of person",
"Body Mass Index (BMI) - score measured", "Systolic Blood Pressure (mmHg)",
"Diastolic Blood Pressure (mmHg)", "Sex of person",
"Total cholesterol - ranged (mmol/L)", "HDL cholesterol - ranged (mmol/L)",
"Fasting LDL cholesterol - ranged (mmol/L)", "Total cholesterol status (mmol/L)",
"HDL cholesterol status - sex dependent (mmol/L)",
"Diabetes prevalence - HbA1c (%)", "Type of medication taken in last 2 weeks (code)",
"Whether insulin used daily", "Whether uses aspirin daily for heart or circulatory condition" , "Currently having insulin every day", "Usual daily serves of fruit", "Usual daily serves of vegetables", "How often salt is used in household for cooking or preparing food", "How often salt is added to food at table", "Whether salt used in cooking or preparing food is iodised", "Whether salt added to meal at table is iodised" ,"Fat content of main type of milk usually consumed", "Whether vegetable and fruit consumption met recommended guidelines", "Main type of milk usually consumed")
variable_df1 <- data.frame(Dataset = dataset_names, Variable_Name = variable_names, Variable_Description = variable_descriptions)
variable_df1Required Variables and Datasets for Research Question 02
Code
dataset_names <- c(
"AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp",
"AHSnhs11bbi",
"AHSnhs11bmd", "AHSnhs11bmd",
"AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp",
"AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp",
"AHSnhs11bsp", "AHSnhs11bsp", "AHSnhs11bsp",
"AHSnhs11bsp"
)
variable_names <- c(
"COBCODBC", "YOABC", "AGEB", "BMISC", "SYSTOL", "DIASTOL", "SEX", "DIABPRVE",
"ATCCURFB", "INSFLAG", "CARDASP", "SMKSTAT", "ALCUSUQ2", "TOTPAC", "ALCSTR01",
"ALCWKLY", "PHDKGW2", "ALINTWK", "EXREGUIC", "EXREGUID", "EXFSRMNE",
"EXLWMBC", "EXLWVBC",
"DIETRDI"
)
variable_descriptions <- c(
"Country of birth", "Year of arrival in Australia", "Age of person", "Body Mass Index (BMI) - score measured",
"Systolic Blood Pressure (mmHg)", "Diastolic Blood Pressure (mmHg)", "Sex of person", "Diabetes prevalence - HbA1c (%)",
"Type of medication taken in last 2 weeks (code)", "Whether insulin used daily", "Whether uses aspirin daily for heart or circulatory condition",
"Smoker status", "Frequency of alcohol consumption in the last 12 months", "Mls of pure alcohol consumed", "Short term alcohol risk (2001 Guidelines)", "Estimated total weekly consumption (in mls)", "Measured weight (kg)", "Average daily intake over week (in mls)",
"Whether exercise last week met 150 minutes recommended guidelines", "Whether exercise last week met 150 minutes and 5 sessions recommended guidelines",
"Total minutes walked for fitness, recreation or sport in last week", "Total minutes undertaken moderate exercise in last week",
"Total minutes undertaken vigorous exercise last week",
"Whether vegetable and fruit consumption met recommended guidelines"
)
variable_df2 <- data.frame(Dataset = dataset_names, Variable_Name = variable_names, Variable_Description = variable_descriptions)
variable_df2Confounding Variables
The following represents the confounding medication variables that we will need to look at further. All individuals who are taking the following medication will be excluded from the analysis
- CARDASP: Whether uses aspirin daily for heart or circulatory condition (medication for hypertension)
- Not applicable
- Aspirin used daily for heart or circulatory condition
- Aspirin not used daily for heart or circulatory condition
- Don’t know if aspirin used daily for heart or circulatory condition
- Not stated if aspirin used daily for heart or circulatory condition
- DIABMQ01: Currently having insulin every day
- Not applicable
- Currently having insulin every day
- Not currently having insulin every day
- Don’t know if currently having insulin every day
- ATCCURFB: Medication B01 17430. ANTITHROMBOTIC AGENTS B06 20190. OTHER HEMATOLOGICAL AGENTS C01 20370. CARDIAC THERAPY C02 21950. ANTIHYPERTENSIVES C03 23030. DIURETICS C04 24020. PERIPHERAL VASODILATORS C05 24460. VASOPROTECTIVES C07 25090. BETA BLOCKING AGENTS C08 25910. CALCIUM CHANNEL BLOCKERS C09 26300. AGENTS ACTING ON THE RENIN-ANGIOTENSIN SYSTEM C10 27090. LIPID MODIFYING AGENTS
Code
raw_AHSnhs11bsp <- read_csv("data/AHSnhs11bsp.csv")
raw_AHSnhs11bbi <- read_csv("data/AHSnhs11bbi.csv")
raw_AHSnhs11bmd <- read_csv("data/AHSnhs11bmd.csv")
raw_AHSnhs11bsp$ID <- paste(raw_AHSnhs11bsp$ABSLID, raw_AHSnhs11bsp$ABSPID, sep = "-")
raw_AHSnhs11bbi$ID <- paste(raw_AHSnhs11bbi$ABSLID, raw_AHSnhs11bbi$ABSPID, sep = "-")
raw_AHSnhs11bmd$ID <- paste(raw_AHSnhs11bmd$ABSLID, raw_AHSnhs11bmd$ABSPID, sep = "-")Variable Selection
We perform separate variable filtering for each question in order to retain as many observations as possible during the cleaning process.
Research Question 1
The following code gets pulls the variables as defined in the Data Import section.
Code
df1 <- raw_AHSnhs11bsp[, c('ID', variable_df1 |> filter(Dataset == 'AHSnhs11bsp') |> pull(Variable_Name))]
df2<- raw_AHSnhs11bbi[, c('ID', variable_df1 |> filter(Dataset == 'AHSnhs11bbi') |> pull(Variable_Name))]
df3<- raw_AHSnhs11bmd[, c('ID', variable_df1 |> filter(Dataset == 'AHSnhs11bmd') |> pull(Variable_Name))]
df_q1 <- df1 |>
full_join(df2, by = "ID") |>
full_join(df3, by = "ID")
df_q1 <- as.data.frame(df_q1)Research Question 2
Code
df1 <- raw_AHSnhs11bsp[, c('ID', variable_df2 |> filter(Dataset == 'AHSnhs11bsp') |> pull(Variable_Name))]
df2<- raw_AHSnhs11bbi[, c('ID', variable_df2 |> filter(Dataset == 'AHSnhs11bbi') |> pull(Variable_Name))]
df3<- raw_AHSnhs11bmd[, c('ID', variable_df2 |> filter(Dataset == 'AHSnhs11bmd') |> pull(Variable_Name))]
df_q2 <- df1 |>
full_join(df2, by = "ID") |>
full_join(df3, by = "ID")
df_q2 <- as.data.frame(df_q2)Variable Type
The following lines of code converts the variables to the correct type.
Question 01
Code
for (i in 1:nrow(types_biom)) {
var_name <- types_biom$variable_name[i]
var_type <- types_biom$variable_type[i]
if (var_name %in% colnames(df_q1)) {
if (var_type == 'string') {
df_q1[[var_name]] <- as.character(df_q1[[var_name]])
} else if (var_type == 'numeric') {
df_q1[[var_name]] <- as.numeric(df_q1[[var_name]])
} else if (var_type == 'categorical') {
df_q1[[var_name]] <- as.factor(df_q1[[var_name]])
}
}
}Question 02
Code
for (i in 1:nrow(types_biom)) {
var_name <- types_biom$variable_name[i]
var_type <- types_biom$variable_type[i]
if (var_name %in% colnames(df_q2)) {
if (var_type == 'string') {
df_q2[[var_name]] <- as.character(df_q2[[var_name]])
} else if (var_type == 'numeric') {
df_q2[[var_name]] <- as.numeric(df_q2[[var_name]])
} else if (var_type == 'categorical') {
df_q2[[var_name]] <- as.factor(df_q2[[var_name]])
}
}
}Code
df_q2[, c("COBCODBC", "YOABC", "CARDASP", "SMKSTAT", "ALCUSUQ2", "ALCSTR01", "EXREGUIC", "EXREGUID", "DIETRDI", "DIABPRVE", "ATCCURFB", "INSFLAG")] <- lapply(df_q2[, c("COBCODBC", "YOABC", "CARDASP", "SMKSTAT", "ALCUSUQ2", "ALCSTR01", "EXREGUIC", "EXREGUID", "DIETRDI", "DIABPRVE", "ATCCURFB", "INSFLAG")], as.factor)Missing Values
The following code deals with implicit missing values
Code
miss_defs <- c("measurement not taken - equipment faulty",
"measurement not taken - other reason",
"measurement not taken - refusal",
"not collected",
"not applicable",
"not determined",
"not known",
"not known if currently on a diet",
"not measured",
"not reported",
"not stated",
"not used",
"not taken",
"not recorded",
"no answer",
"not stated",
"valid reading not obtained",
"not obtained")
raw_to_tech <- function(proc, special, types)
{
var_names <- colnames(proc)
for (j in 1:length(var_names))
{
var_val <- var_names[j]
specials <- special %>%
filter(variable_name==var_val)
if (nrow(specials)>0)
{
ind <- which(var_names==var_val)
var_miss_str <- paste0(var_val,"_MISS")
var_miss_reas <- rep("observed",nrow(proc))
var_vals <- proc[,ind]
var_type <- types %>%
filter(variable_name==var_val) %>%
select(variable_type) %>%
as.character()
if (var_type=="numeric") {
for (i in 1:length(var_vals)) {
specials <- specials %>% filter(str_detect(meaning, paste(miss_defs, collapse = "|")))
if (var_vals[i] %in% specials$value) {
ind2 <- which(var_vals[i]==specials$value)
var_vals[i] <- NA
var_miss_reas[i] <- specials[ind2,3] %>% as.character()
}
}
}
if (var_type=="categorical") {
for (i in 1:length(var_vals)) {
spec_val <- specials$value
spec_meam <- specials$meaning
if (var_vals[i] %in% spec_val)
{
var_mean <- spec_meam[var_vals[i] == spec_val]
if (var_mean %in% miss_defs) {
var_vals[i] <- NA
var_miss_reas[i] <- var_mean
}
} else {
var_vals[i] <- NA
var_miss_reas[i] <- "unknown"
}
}
}
if (any(is.na(var_vals))) {
proc[,ind] <- var_vals
proc$dummy <- var_miss_reas
colnames(proc)[ncol(proc)] <- var_miss_str
}
}
}
return(proc)
}
tech_q1 <- raw_to_tech(df_q1, special_biom, types_biom)
tech_q2 <- raw_to_tech(df_q2, special_biom, types_biom)The following code is to further clean any and all values above 9995 and assign them NA values.
Code
update_large_values <- function(data, threshold = 9995, reason = "Missing due to unknown reasons") {
numeric_cols <- colnames(data)[sapply(data, is.numeric)]
for (col in numeric_cols) {
# Set specific thresholds for columns
col_threshold <- if (col == "ATCCURFB") {
99995
} else if (col == "PHDKGW2") {
995
} else {
threshold
}
miss_column <- paste0(col, "_MISS")
if (!miss_column %in% colnames(data)) {
data[[miss_column]] <- "observed"
}
# Update values based on the threshold
data[[miss_column]][data[[col]] > col_threshold] <- reason
data[[col]] <- ifelse(data[[col]] > col_threshold, NA, data[[col]])
# Special case for PHDKGW2: change 0 values to NA
if (col == "PHDKGW2") {
data[[col]] <- ifelse(data[[col]] == 0, NA, data[[col]])
}
}
return(data)
}
tech_q1 <- update_large_values(tech_q1)
tech_q2 <- update_large_values(tech_q2)Visualising Missing Values
Code
g <- tech_q2 %>%
keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_histogram() + theme_bw() + ggtitle('Histogram of Numeric Variables')
gData looks technically correct now, without substantial or significant missing values that stand out.
Preparation
The following are the given categorical bins for LDL and HDL variables.
HDLCHREB HDL cholesterol - ranged (mmol/L) 1. Less than 1.0 2. 1.0 to less than 1.3 3. 1.3 to less than 1.5 4. 1.5 to less than 2.0 5. 2.0 to less than 2.5 6. 2.5 or more 7. Not applicable 8. Not reported
LDLRESB Fasting LDL cholesterol - ranged (mmol/L) 01. Less than 1.5 02. 1.5 to less than 2.0 03. 2.0 to less than 2.5 04. 2.5 to less than 3.0 05. 3.0 to less than 3.5 06. 3.5 to less than 4.0 07. 4.0 to less than 4.5 08. 4.5 or more 97. Not applicable 98. Not reported
Using this, we will convert this categorical and right censored data to numeric with the following code.
HDL cholesterol (HDLCHREB)
The following code uses the fitdistrplus to perform conversion from categorical to numerical given the bins and sample sizes of each gender and measure found from the published data dictionary. Details on the method can be found in the paper.
Code
library(fitdistrplus)
expected_value_interval_norm <- function(mean, sd, lower, upper) {
if (is.infinite(upper)) {
upper <- mean + 10 * sd
}
numerator <- integrate(function(x) x * dnorm(x, mean, sd), lower, upper)$value
denominator <- pnorm(upper, mean, sd) - pnorm(lower, mean, sd)
expected_value <- numerator / denominator
return(expected_value)
}
hdl_male <- data.frame(
Range = c("<1.0", "1.0-1.3", "1.3-1.5", "1.5-2.0", "2.0-2.5", "≥2.5"),
Lower = c(0, 1.0, 1.3, 1.5, 2.0, 2.5),
Upper = c(1.0, 1.3, 1.5, 2.0, 2.5, 3.5),
Count = c(1591.6, 3468.8, 1774.9, 1327.0, 125.4, 22.5)
)
hdl_male$Count <- round(hdl_male$Count * 1000)
hdl_male_cens <- data.frame(
left = hdl_male$Lower,
right = hdl_male$Upper,
count = hdl_male$Count
)
hdl_male_cens_expanded <- hdl_male_cens[rep(1:nrow(hdl_male_cens), hdl_male_cens$count), 1:2]
fit_hdl_male <- fitdistcens(hdl_male_cens_expanded, "norm")
hdl_male$ExpectedValue <- mapply(
expected_value_interval_norm,
mean = fit_hdl_male$estimate["mean"],
sd = fit_hdl_male$estimate["sd"],
lower = hdl_male$Lower,
upper = hdl_male$Upper
)
hdl_female <- data.frame(
Range = c("<1.0", "1.0-1.3", "1.3-1.5", "1.5-2.0", "2.0-2.5", "≥2.5"),
Lower = c(0, 1.0, 1.3, 1.5, 2.0, 2.5),
Upper = c(1.0, 1.3, 1.5, 2.0, 2.5, 3.5),
Count = c(411.3, 1938.4, 1970.4, 3304.5, 814.3, 72.2)
)
hdl_female$Count <- round(hdl_female$Count * 1000)
hdl_female_cens <- data.frame(
left = hdl_female$Lower,
right = hdl_female$Upper,
count = hdl_female$Count
)
hdl_female_cens_expanded <- hdl_female_cens[rep(1:nrow(hdl_female_cens), hdl_female_cens$count), 1:2]
fit_hdl_female <- fitdistcens(hdl_female_cens_expanded, "norm")
hdl_female$ExpectedValue <- mapply(
expected_value_interval_norm,
mean = fit_hdl_female$estimate["mean"],
sd = fit_hdl_female$estimate["sd"],
lower = hdl_female$Lower,
upper = hdl_female$Upper
)
hdl_male$HDL_Category = c(1:6)
hdl_female$HDL_Category = c(1:6)The following code plots the empirical vs fitted male HDL distribution.
Code
library(ggplot2)
hdl_male$Proportion <- hdl_male$Count / sum(hdl_male$Count)
hdl_female$Proportion <- hdl_female$Count / sum(hdl_female$Count)
max_proportion_female <- max(hdl_female$Proportion)
max_proportion_male <- max(hdl_male$Proportion)
ggplot(hdl_male, aes(x = ExpectedValue, y = Proportion)) +
geom_bar(stat = "identity", fill = "lightblue", alpha = 0.6) +
stat_function(fun = function(x) dnorm(x, mean = fit_hdl_male$estimate["mean"], sd = fit_hdl_male$estimate["sd"]) *
max_proportion_male, color = "blue", size = 1) +
labs(title = "Empirical vs. Fitted Proportion Distribution of HDL in Males",
x = "HDL Level", y = "Proportion") +
theme_bw()Code
ggplot(hdl_female, aes(x = ExpectedValue, y = Proportion)) +
geom_bar(stat = "identity", fill = "pink", alpha = 0.6) +
stat_function(fun = function(x) dnorm(x, mean = fit_hdl_female$estimate["mean"], sd = fit_hdl_female$estimate["sd"]) *
max_proportion_female, color = "red", size = 1) +
labs(title = "Empirical vs. Fitted Proportion Distribution of HDL in Females",
x = "HDL Level", y = "Proportion") +
theme_bw()Code
hdl_combined <- rbind(
transform(hdl_male, Sex = "Male"),
transform(hdl_female, Sex = "Female")
)
ggplot(hdl_combined, aes(x = HDL_Category, y = ExpectedValue, color = Sex, group = Sex)) +
geom_line(size = 1) +
geom_point(size = 2) +
labs(title = "Expected HDL Levels by Category for Males and Females",
x = "HDL Category", y = "Expected HDL Level") +
theme_minimal() +
scale_x_continuous(breaks = 1:6, labels = hdl_male$Range) +
theme(legend.position = "top")The following code saves the data.
Code
hdl_male_lookup <- data.frame(
HDLCHREB = 1:6,
HDL_Expected = hdl_male$ExpectedValue
)
hdl_female_lookup <- data.frame(
HDLCHREB = 1:6,
HDL_Expected = hdl_female$ExpectedValue
)
tech_q1$HDLCHREB <- as.numeric(tech_q1$HDLCHREB)
tech_q1$SEX <- as.character(tech_q1$SEX)
get_hdl_expected <- function(sex, hdlchreb) {
if (sex == 2) {
return(hdl_female_lookup$HDL_Expected[match(hdlchreb, hdl_female_lookup$HDLCHREB)])
} else if (sex == 1) {
return(hdl_male_lookup$HDL_Expected[match(hdlchreb, hdl_male_lookup$HDLCHREB)])
} else {
return(NA) # Handle cases where SEX is missing or unspecified
}
}
tech_q1$HDL_Expected <- mapply(get_hdl_expected, tech_q1$SEX, tech_q1$HDLCHREB)LDL cholesterol (LDLRESB)
Follows the same description as previous.
Code
library(fitdistrplus)
expected_value_interval_norm <- function(mean, sd, lower, upper) {
if (is.infinite(upper)) {
upper <- mean + 10 * sd
}
numerator <- integrate(function(x) x * dnorm(x, mean, sd), lower, upper)$value
denominator <- pnorm(upper, mean, sd) - pnorm(lower, mean, sd)
expected_value <- numerator / denominator
return(expected_value)
}
ldl_male <- data.frame(
Range = c(
"<1.5",
"1.5-2.0",
"2.0-2.5",
"2.5-3.0",
"3.0-3.5",
"3.5-4.0",
"4.0-4.5",
">=4.5"
),
Lower = c(0.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5),
Upper = c(1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 7),
Count = c(90.0, 426.5, 851.6, 1395.7, 1447.9, 1129.9, 580.1, 621.8)
)
ldl_male$Count <- round(ldl_male$Count * 1000)
ldl_male_cens <- data.frame(
left = ldl_male$Lower,
right = ldl_male$Upper,
count = ldl_male$Count
)
ldl_male_cens_expanded <- ldl_male_cens[rep(1:nrow(ldl_male_cens), ldl_male_cens$count), 1:2]
fit_ldl_male <- fitdistcens(ldl_male_cens_expanded, "norm")
ldl_male$ExpectedValue <- mapply(
expected_value_interval_norm,
mean = fit_ldl_male$estimate["mean"],
sd = fit_ldl_male$estimate["sd"],
lower = ldl_male$Lower,
upper = ldl_male$Upper
)
ldl_female <- data.frame(
Range = c(
"<1.5",
"1.5-2.0",
"2.0-2.5",
"2.5-3.0",
"3.0-3.5",
"3.5-4.0",
"4.0-4.5",
">=4.5"
),
Lower = c(0.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5),
Upper = c(1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 7.0),
Count = c(92.3, 442.7, 1094.5, 1665.1, 1379.4, 1116.7, 546.5, 501.1)
)
ldl_female$Count <- round(ldl_female$Count * 1000)
ldl_female_cens <- data.frame(
left = ldl_female$Lower,
right = ldl_female$Upper,
count = ldl_female$Count
)
ldl_female_cens_expanded <- ldl_female_cens[rep(1:nrow(ldl_female_cens), ldl_female_cens$count), 1:2]
fit_ldl_female <- fitdistcens(ldl_female_cens_expanded, "norm")
ldl_female$ExpectedValue <- mapply(
expected_value_interval_norm,
mean = fit_ldl_female$estimate["mean"],
sd = fit_ldl_female$estimate["sd"],
lower = ldl_female$Lower,
upper = ldl_female$Upper
)
ldl_male$LDL_Category = c(1:8)
ldl_female$LDL_Category = c(1:8)Code
ldl_male_lookup <- data.frame(
LDLRESB = 1:8,
LDL_Expected = ldl_male$ExpectedValue
)
ldl_female_lookup <- data.frame(
LDLRESB = 1:8,
LDL_Expected = ldl_female$ExpectedValue
)
tech_q1$LDLRESB <- as.numeric(tech_q1$LDLRESB)
tech_q1$SEX <- as.character(tech_q1$SEX)
get_ldl_expected <- function(sex, LDLRESB) {
if (sex == 2) {
return(ldl_female_lookup$LDL_Expected[match(LDLRESB, ldl_female_lookup$LDLRESB)])
} else if (sex == 1) {
return(ldl_male_lookup$LDL_Expected[match(LDLRESB, ldl_male_lookup$LDLRESB)])
} else {
return(NA) # Handle cases where SEX is missing or unspecified
}
}
tech_q1$LDL_Expected <- mapply(get_ldl_expected, tech_q1$SEX, tech_q1$LDLRESB)Code
ldl_male$Proportion <- ldl_male$Count / sum(ldl_male$Count)
ldl_female$Proportion <- ldl_female$Count / sum(ldl_female$Count)
max_proportion_male <- max(ldl_male$Proportion)
max_proportion_female <- max(ldl_female$Proportion)
max_density_male <- dnorm(fit_ldl_male$estimate["mean"], mean = fit_ldl_male$estimate["mean"], sd = fit_ldl_male$estimate["sd"])
max_density_female <- dnorm(fit_ldl_female$estimate["mean"], mean = fit_ldl_female$estimate["mean"], sd = fit_ldl_female$estimate["sd"])
scale_factor_male <- max_proportion_male / max_density_male
scale_factor_female <- max_proportion_female / max_density_female
ggplot(ldl_male, aes(x = ExpectedValue, y = Proportion)) +
geom_bar(stat = "identity", fill = "lightblue", alpha = 0.6) +
stat_function(fun = function(x) dnorm(x, mean = fit_ldl_male$estimate["mean"], sd = fit_ldl_male$estimate["sd"]) *
scale_factor_male, color = "blue", size = 1) +
labs(title = "Empirical vs. Fitted Proportion Distribution of LDL in Males",
x = "LDL Level", y = "Proportion") +
theme_minimal()Code
ggplot(ldl_female, aes(x = ExpectedValue, y = Proportion)) +
geom_bar(stat = "identity", fill = "pink", alpha = 0.6) +
stat_function(fun = function(x) dnorm(x, mean = fit_ldl_female$estimate["mean"], sd = fit_ldl_female$estimate["sd"]) *
scale_factor_female, color = "red", size = 1) +
labs(title = "Empirical vs. Fitted Proportion Distribution of LDL in Females",
x = "LDL Level", y = "Proportion") +
theme_minimal()Code
ldl_combined <- rbind(
transform(ldl_male, Sex = "Male"),
transform(ldl_female, Sex = "Female")
)
ggplot(ldl_combined, aes(x = LDL_Category, y = ExpectedValue, color = Sex, group = Sex)) +
geom_line(size = 1) +
geom_point(size = 2) +
labs(title = "Expected LDL Levels by Category for Males and Females",
x = "LDL Category", y = "Expected LDL Level") +
theme_minimal() +
scale_x_continuous(breaks = 1:8, labels = ldl_male$Range) +
theme(legend.position = "top")Excluding people with medication
LDL-HDL Ratio
Code
non_medication_tech_q1$LDL_HDL_ratio <- non_medication_tech_q1$LDL_Expected / non_medication_tech_q1$HDL_ExpectedSelecting only required variables
Code
Code
table(final_q1$COBCODBC)
1 2 3
2891 506 574
Code
table(final_q1$YOABC)
1 2 3 4 5 6 7
2891 536 86 66 89 123 180
Filtering more columns of unknown responses
Regrouping Fruit and Vegetables column
IDA
SALTATI
Code
# Define labels for COBCODBC
labels <- c("Australians", "Main English Speaking Countries", "Other")
# Create the plot with relabeled legend
ggplot(final_q1, aes(x = SALTATI, y = LDL_Expected, color = COBCODBC, group = COBCODBC)) +
stat_summary(fun = "mean", geom = "point", size = 3) + # Points for means
stat_summary(fun = "mean", geom = "line", aes(group = COBCODBC)) + # Line connecting means
labs(title = "Interaction Plot: SALTATI and Absolute LDL by Country of Birth",
x = "SALTATI",
y = "Absolute LDL (mmol/L)",
color = "Country of Birth") + # Change legend title
scale_color_manual(values = c("lightblue3", "tomato3", "palegreen4"), labels = labels) +
theme_bw()Code
# Define labels for COBCODBC
labels <- c("Australians", "Main English Speaking Countries", "Other")
# Create the plot with relabeled legend
ggplot(final_q1, aes(x = SALTATI, y = LDL_HDL_ratio, color = COBCODBC, group = COBCODBC)) +
stat_summary(fun = "mean", geom = "point", size = 3) + # Points for means
stat_summary(fun = "mean", geom = "line", aes(group = COBCODBC), size = 1) + # Line connecting means
labs(title = "Interaction Plot: SALTATI and LDL/HDL Ratio by Country of Birth",
x = "SALTATI",
y = "LDL/HDL Ratio",
color = "Country of Birth") + # Change legend title
scale_color_manual(values = c("lightblue3", "tomato3", "palegreen4"), labels = labels) +
theme_bw()SALTACI
Code
table(final_q1$SALTACI , final_q1$COBCODBC )
1 2 3
1 750 160 257
2 450 74 159
3 1443 238 86
Code
final_q1$SALTACI <- as.factor(final_q1$SALTACI)
ggplot(final_q1, aes(x = SALTACI, y = LDL_HDL_ratio, color = COBCODBC, group = COBCODBC)) +
# geom_boxplot(position = position_dodge(width = 0.75)) + # Boxplot for each level of DIETQ5
stat_summary(fun = "mean", geom = "point", size = 3) + # Points for means
stat_summary(fun = "mean", geom = "line", aes(group = COBCODBC)) + # Line connecting means
labs(title = "Interaction Plot: Categorical DIETQ5 and LDL/HDL Ratio by Country of Birth",
x = "SALTACI",
y = "LDL/HDL Ratio") +
theme_bw()MILKFATU
Code
final_q1$MILKFATU <- as.factor(final_q1$MILKFATU)
ggplot(final_q1, aes(x = MILKFATU, y = LDL_HDL_ratio, color = COBCODBC, group = COBCODBC)) +
# geom_boxplot(position = position_dodge(width = 0.75)) + # Boxplot for each level of DIETQ5
stat_summary(fun = "mean", geom = "point", size = 3) + # Points for means
stat_summary(fun = "mean", geom = "line", aes(group = COBCODBC)) + # Line connecting means
labs(title = "Interaction Plot: Categorical DIETQ5 and LDL/HDL Ratio by Country of Birth",
x = "MILKFATU",
y = "LDL/HDL Ratio") +
theme_bw()Code
table(final_q1$MILKFATU,final_q1$COBCODBC )
1 2 3
1 993 160 241
2 1070 197 159
3 451 97 78
5 129 18 24
DIETRDI
Code
final_q1$DIETRDI <- as.factor(final_q1$DIETRDI)
ggplot(final_q1, aes(x = DIETRDI, y = LDL_HDL_ratio, color = COBCODBC, group = COBCODBC)) +
# geom_boxplot(position = position_dodge(width = 0.75)) + # Boxplot for each level of DIETQ5
stat_summary(fun = "mean", geom = "point", size = 3) + # Points for means
stat_summary(fun = "mean", geom = "line", aes(group = COBCODBC)) + # Line connecting means
labs(title = "Interaction Plot: Categorical DIETQ5 and LDL/HDL Ratio by Country of Birth",
x = "DIETRDI",
y = "LDL/HDL Ratio") +
theme_bw()Code
table(final_q1$MILKFATU,final_q1$COBCODBC )
1 2 3
1 993 160 241
2 1070 197 159
3 451 97 78
5 129 18 24
DIETQ1
Code
final_q1$DIETQ1 <- as.factor(final_q1$DIETQ1)
ggplot(final_q1, aes(x = DIETQ1, y = LDL_HDL_ratio, color = COBCODBC, group = COBCODBC)) +
# geom_boxplot(position = position_dodge(width = 0.75)) + # Boxplot for each level of DIETQ5
stat_summary(fun = "mean", geom = "point", size = 3) + # Points for means
stat_summary(fun = "mean", geom = "line", aes(group = COBCODBC)) + # Line connecting means
labs(title = "Interaction Plot: Categorical DIETQ5 and LDL/HDL Ratio by Country of Birth",
x = "DIETQ1",
y = "LDL/HDL Ratio") +
theme_bw()DIETQ5
Code
ggplot(final_q1, aes(x = DIETQ5, y = LDL_HDL_ratio, color = COBCODBC)) +
geom_point()+
geom_smooth(method = "lm", se = FALSE) + facet_wrap(~ COBCODBC, scales = "free_y") +
labs(title = "LDL/HDL Ratio against DIETQ5 and COBCODBC", x = "DIETQ5",y = "LDL/HDL Ratio") +
theme_bw()DIETQ8
Code
labels <- c("Australians", "Main English Speaking Countries", "Other")
ggplot(final_q1, aes(x = DIETQ8, y = LDL_Expected, color = COBCODBC)) +
geom_point(position = 'jitter', alpha = 0.2) +
geom_smooth(method = "lm", se = FALSE) + # Add regression lines for each group
labs(title = "Interaction Between Fruit Intake (DIETQ8) and Absolute LDL by Country of Birth",
x = "Fruit Intake (DIETQ8)",
y = "Absolute LDL") + facet_wrap(~ COBCODBC, scales = "free_y") +
theme_bw()+ # Change legend title
scale_color_manual(values = c("lightblue4", "tomato3", "palegreen4"), labels = labels) +
theme_bw()Code
labels <- c("Australians", "Main English Speaking Countries", "Other")
ggplot(final_q1, aes(x = DIETQ8, y = LDL_HDL_ratio, color = COBCODBC)) +
geom_point(position = 'jitter', alpha = 0.2) +
geom_smooth(method = "lm", se = FALSE) + # Add regression lines for each group
labs(title = "Interaction Between Fruit Intake (DIETQ8) and LDL/HDL Ratio by Country of Birth",
x = "Fruit Intake (DIETQ8)",
y = "LDL/HDL Ratio") + facet_wrap(~ COBCODBC, scales = "free_y") +
theme_bw()+ # Change legend title
scale_color_manual(values = c("lightblue4", "tomato3", "palegreen4"), labels = labels) +
theme_bw()DIETQ5 and DIETQ8
Code
# Heatmap showing the interaction between DIETQ5 and DIETQ8 on LDL/HDL ratio
ggplot(final_q1, aes(x = DIETQ5, y = DIETQ8, fill = LDL_HDL_ratio)) +
geom_tile() +
facet_wrap(~COBCODBC) + # Separate plots for each country
labs(title = "Heatmap of LDL/HDL Ratio Based on Vegetable and Fruit Intake",
x = "Vegetable Intake (DIETQ5)",
y = "Fruit Intake (DIETQ8)",
fill = "LDL/HDL Ratio") +
theme_bw()Linear Mixed Effects Model
All Interactions
Code
final_q1$DIETQ8 <- as.numeric(final_q1$DIETQ8 )
final_q1$DIETQ5 <- as.numeric(final_q1$DIETQ5 )
final_q1$DIETQ1 <- as.factor(final_q1$DIETQ1 )
final_q1$SALTATI <- as.factor(final_q1$SALTATI )
final_q1$SALTACI <- as.factor(final_q1$SALTACI )
final_q1$MILKFATU <- as.factor(final_q1$MILKFATU )Code
model_ldl_hdl <- lmer(
LDL_Expected ~ +
(DIETQ5 + DIETQ8 + MILKFATU + DIETQ1 ) * COBCODBC +
(1 | AGEB),
data = final_q1
)
summary(model_ldl_hdl)Linear mixed model fit by REML ['lmerMod']
Formula: LDL_Expected ~ +(DIETQ5 + DIETQ8 + MILKFATU + DIETQ1) * COBCODBC +
(1 | AGEB)
Data: final_q1
REML criterion at convergence: 8767.6
Scaled residuals:
Min 1Q Median 3Q Max
-2.98604 -0.66729 -0.05657 0.63333 2.85462
Random effects:
Groups Name Variance Std.Dev.
AGEB (Intercept) 0.09681 0.3111
Residual 0.63989 0.7999
Number of obs: 3617, groups: AGEB, 17
Fixed effects:
Estimate Std. Error t value
(Intercept) 3.189186 0.086482 36.877
DIETQ5 -0.009637 0.012018 -0.802
DIETQ8 -0.008550 0.013639 -0.627
MILKFATU2 -0.060393 0.035610 -1.696
MILKFATU3 -0.127196 0.045862 -2.773
MILKFATU5 -0.009948 0.075388 -0.132
DIETQ12 -0.155029 0.069825 -2.220
COBCODBC2 0.108738 0.105756 1.028
COBCODBC3 0.057770 0.096541 0.598
DIETQ5:COBCODBC2 0.034987 0.031074 1.126
DIETQ5:COBCODBC3 0.003869 0.030519 0.127
DIETQ8:COBCODBC2 -0.110004 0.036098 -3.047
DIETQ8:COBCODBC3 -0.054198 0.035150 -1.542
MILKFATU2:COBCODBC2 -0.022444 0.092676 -0.242
MILKFATU3:COBCODBC2 -0.039699 0.113146 -0.351
MILKFATU5:COBCODBC2 -0.188127 0.214042 -0.879
MILKFATU2:COBCODBC3 0.033335 0.090315 0.369
MILKFATU3:COBCODBC3 0.070173 0.114892 0.611
MILKFATU5:COBCODBC3 -0.106912 0.187669 -0.570
DIETQ12:COBCODBC2 0.282540 0.189091 1.494
DIETQ12:COBCODBC3 0.242963 0.137189 1.771
fit warnings:
fixed-effect model matrix is rank deficient so dropping 3 columns / coefficients
Assumptions
Linearity
Normality
CLT -> Normality is okay
Random effect distribution
Redo Analysis Using Absolute LDL
Code
model_ldl <- lmer(
LDL_Expected ~ +
(DIETQ5 + DIETQ8 + MILKFATU + SALTATI + SALTACI + DIETQ1) * COBCODBC +
(1 | AGEB),
data = final_q1
)
summary(model_ldl)Linear mixed model fit by REML ['lmerMod']
Formula: LDL_Expected ~ +(DIETQ5 + DIETQ8 + MILKFATU + SALTATI + SALTACI +
DIETQ1) * COBCODBC + (1 | AGEB)
Data: final_q1
REML criterion at convergence: 8789.1
Scaled residuals:
Min 1Q Median 3Q Max
-3.0181 -0.6706 -0.0460 0.6365 2.8711
Random effects:
Groups Name Variance Std.Dev.
AGEB (Intercept) 0.09599 0.3098
Residual 0.63856 0.7991
Number of obs: 3617, groups: AGEB, 17
Fixed effects:
Estimate Std. Error t value
(Intercept) 3.2194366 0.0907878 35.461
DIETQ5 -0.0112486 0.0120278 -0.935
DIETQ8 -0.0071896 0.0136769 -0.526
MILKFATU2 -0.0523478 0.0358692 -1.459
MILKFATU3 -0.1142791 0.0465255 -2.456
MILKFATU5 -0.0004972 0.0755298 -0.007
SALTATI2 0.0422872 0.0511258 0.827
SALTATI3 -0.0273859 0.0385363 -0.711
SALTACI2 -0.0228724 0.0532994 -0.429
SALTACI3 -0.0427701 0.0379499 -1.127
DIETQ12 -0.1519029 0.0698199 -2.176
COBCODBC2 -0.1103915 0.1223605 -0.902
COBCODBC3 0.0157575 0.1255820 0.125
DIETQ5:COBCODBC2 0.0358454 0.0311736 1.150
DIETQ5:COBCODBC3 0.0053220 0.0306631 0.174
DIETQ8:COBCODBC2 -0.1216838 0.0362321 -3.358
DIETQ8:COBCODBC3 -0.0574787 0.0351771 -1.634
MILKFATU2:COBCODBC2 -0.0610342 0.0940963 -0.649
MILKFATU3:COBCODBC2 -0.0868113 0.1165373 -0.745
MILKFATU5:COBCODBC2 -0.2281050 0.2145183 -1.063
MILKFATU2:COBCODBC3 0.0286219 0.0904858 0.316
MILKFATU3:COBCODBC3 0.0697412 0.1162668 0.600
MILKFATU5:COBCODBC3 -0.1240031 0.1887486 -0.657
SALTATI2:COBCODBC2 0.1001062 0.1367455 0.732
SALTATI3:COBCODBC2 0.3067783 0.0991748 3.093
SALTATI2:COBCODBC3 -0.0055759 0.1586705 -0.035
SALTATI3:COBCODBC3 0.0480577 0.1048075 0.459
SALTACI2:COBCODBC2 0.0516511 0.1407191 0.367
SALTACI3:COBCODBC2 0.1514477 0.0971475 1.559
SALTACI2:COBCODBC3 0.0478579 0.1051472 0.455
SALTACI3:COBCODBC3 -0.0284917 0.1084511 -0.263
DIETQ12:COBCODBC2 0.2255369 0.1903993 1.185
DIETQ12:COBCODBC3 0.2406180 0.1373757 1.752
fit warnings:
fixed-effect model matrix is rank deficient so dropping 3 columns / coefficients
Code
Factor w/ 9 levels "0","1","2","3",..: 6 3 2 1 8 5 5 5 5 5 ...
Code
str(data$ALCSTR01) Factor w/ 9 levels "0","1","2","3",..: 4 2 2 1 6 4 4 4 4 4 ...
Code
dataSmoking
Code
Linear mixed model fit by REML ['lmerMod']
Formula: SYSTOL ~ (1 | AGEB) + (1 | SEX) + SMKSTAT * COBCODBC
Data: data_clean1
REML criterion at convergence: 323878.9
Scaled residuals:
Min 1Q Median 3Q Max
-3.9270 -0.6633 -0.0834 0.5600 5.8065
Random effects:
Groups Name Variance Std.Dev.
AGEB (Intercept) 134.155 11.583
SEX (Intercept) 6.109 2.472
Residual 328.402 18.122
Number of obs: 37512, groups: AGEB, 16; SEX, 2
Fixed effects:
Estimate Std. Error t value
(Intercept) 127.368674 3.395288 37.513
SMKSTAT2 -2.703954 1.133933 -2.385
SMKSTAT3 0.539468 1.810742 0.298
SMKSTAT4 -2.917129 0.346743 -8.413
SMKSTAT5 -1.831677 0.334580 -5.475
COBCODBC2 0.349355 0.766995 0.455
COBCODBC3 0.706628 0.772916 0.914
SMKSTAT2:COBCODBC2 -1.559212 2.915004 -0.535
SMKSTAT3:COBCODBC2 6.960866 5.391496 1.291
SMKSTAT4:COBCODBC2 -1.191154 0.875963 -1.360
SMKSTAT5:COBCODBC2 -2.272666 0.894549 -2.541
SMKSTAT2:COBCODBC3 1.027121 2.732123 0.376
SMKSTAT3:COBCODBC3 1.299832 5.807906 0.224
SMKSTAT4:COBCODBC3 1.135671 0.907714 1.251
SMKSTAT5:COBCODBC3 0.000531 0.853761 0.001
Code
summary(diastol_model1)Linear mixed model fit by REML ['lmerMod']
Formula: DIASTOL ~ (1 | AGEB) + (1 | SEX) + SMKSTAT * COBCODBC
Data: data_clean1
REML criterion at convergence: 285395.6
Scaled residuals:
Min 1Q Median 3Q Max
-4.5817 -0.6651 -0.0382 0.6109 5.2710
Random effects:
Groups Name Variance Std.Dev.
AGEB (Intercept) 15.5184 3.9393
SEX (Intercept) 0.1512 0.3888
Residual 117.7374 10.8507
Number of obs: 37512, groups: AGEB, 16; SEX, 2
Fixed effects:
Estimate Std. Error t value
(Intercept) 76.3083 1.0379 73.519
SMKSTAT2 -0.8670 0.6790 -1.277
SMKSTAT3 -0.5169 1.0842 -0.477
SMKSTAT4 -1.3119 0.2076 -6.319
SMKSTAT5 -1.0927 0.2003 -5.455
COBCODBC2 -0.1040 0.4592 -0.226
COBCODBC3 0.5782 0.4628 1.249
SMKSTAT2:COBCODBC2 1.3628 1.7454 0.781
SMKSTAT3:COBCODBC2 5.2133 3.2282 1.615
SMKSTAT4:COBCODBC2 -0.6894 0.5245 -1.314
SMKSTAT5:COBCODBC2 -1.1580 0.5356 -2.162
SMKSTAT2:COBCODBC3 4.9966 1.6359 3.054
SMKSTAT3:COBCODBC3 4.6543 3.4775 1.338
SMKSTAT4:COBCODBC3 0.7167 0.5435 1.319
SMKSTAT5:COBCODBC3 0.4795 0.5112 0.938
Code
data_clean2 <- data %>%
filter(!is.na(SYSTOL) & !is.na(DIASTOL) & !is.na(COBCODBC) & !is.na(TOTPAC) & !is.na(ALCWKLY) &
!is.na(AGEB) & !is.na(SEX) & !is.na(ALCUSUQ2) & !is.na(ALCSTR01) & !is.na(PHDKGW2))
data_clean2$TOTPAC_normalise = data_clean2$TOTPAC/data_clean2$PHDKGW2
data_clean2$ALCWKLY_normalise = data_clean2$ALCWKLY/data_clean2$PHDKGW2Code
# Define labels for COBCODBC
labels <- c("Australians", "Main English Speaking Countries", "Other")
# Create the plot with relabeled legend and x-axis labels for SMKSTAT
ggplot(data_clean2, aes(x = SMKSTAT, y = SYSTOL, color = COBCODBC, group = COBCODBC)) +
stat_summary(fun = "mean", geom = "point", size = 3) + # Points for means
stat_summary(fun = "mean", geom = "line", aes(group = COBCODBC), size = 1) + # Line connecting means
labs(title = "Interaction Plot: Smoking Status and SYSTOL by Country of Birth",
x = "Smoking Status",
y = "SYSTOL",
color = "Country of Birth") + # Change legend title
scale_x_discrete(labels = c(
"0" = "Not applicable",
"1" = "Current \n smoker daily",
"2" = "Current \n smoker weekly",
"3" = "Current \n smoker \n less than weekly",
"4" = "Ex-smoker",
"5" = "Never smoked"
)) + # Custom x-axis labels for SMKSTAT
scale_color_manual(values = c("lightblue3", "tomato3", "palegreen4"), labels = labels) +
theme_bw()Alcohol
Code
hist(data$TOTPAC)Code
alcohol_mapping <- c(
"0" = 0, # Not applicable
"1" = 365, # Every day
"2" = 260, # 5 to 6 days a week (approx. 5.5 * 52)
"3" = 208, # 3 to 4 days a week (approx. 4 * 52)
"4" = 104, # 1 to 2 days a week (approx. 2 * 52)
"5" = 36, # 2 to 3 days a month (approx. 2.5 * 12)
"6" = 12, # 1 day a month
"7" = 6, # Less than once a month (approx. 0.5 * 12)
"8" = NA # Not known
)
# Apply the mapping to convert ALCUSUQ2 to numeric
data_clean2$ALCUSUQ2_numeric <- as.numeric(alcohol_mapping[as.character(data_clean2$ALCUSUQ2)])Code
# SYSTOL
data_clean2$TOTPAC_normalise <- as.numeric(data_clean2$TOTPAC_normalise )
data_clean2$ALCWKLY_normalise <- as.numeric(data_clean2$ALCWKLY_normalise )
systol_model2 <- lmer(
SYSTOL ~ (1 | AGEB) + (1 | SEX) + TOTPAC_normalise * COBCODBC + ALCWKLY_normalise * COBCODBC ,
data = data_clean2
)
diastol_model2 <- lmer(
DIASTOL ~ (1 | AGEB) + (1 | SEX) + TOTPAC_normalise * COBCODBC + ALCWKLY_normalise * COBCODBC,
data = data_clean2
)
# Summaries
summary(systol_model2)Linear mixed model fit by REML ['lmerMod']
Formula: SYSTOL ~ (1 | AGEB) + (1 | SEX) + TOTPAC_normalise * COBCODBC +
ALCWKLY_normalise * COBCODBC
Data: data_clean2
REML criterion at convergence: 173334.2
Scaled residuals:
Min 1Q Median 3Q Max
-3.9831 -0.6637 -0.1041 0.5583 6.0818
Random effects:
Groups Name Variance Std.Dev.
AGEB (Intercept) 121.16 11.007
SEX (Intercept) 9.22 3.036
Residual 308.69 17.570
Number of obs: 20214, groups: AGEB, 16; SEX, 2
Fixed effects:
Estimate Std. Error t value
(Intercept) 124.5794 3.4983 35.612
TOTPAC_normalise 0.2467 0.2529 0.975
COBCODBC2 0.1262 0.5384 0.234
COBCODBC3 1.2703 0.5452 2.330
ALCWKLY_normalise 0.3676 0.1298 2.832
TOTPAC_normalise:COBCODBC2 -0.5499 0.6075 -0.905
TOTPAC_normalise:COBCODBC3 0.2170 0.8144 0.266
COBCODBC2:ALCWKLY_normalise -0.1200 0.3282 -0.366
COBCODBC3:ALCWKLY_normalise -0.3522 0.4351 -0.809
Correlation of Fixed Effects:
(Intr) TOTPAC_n COBCODBC2 COBCODBC3 ALCWKL TOTPAC_:COBCODBC2
TOTPAC_nrml -0.031
COBCODBC2 -0.021 0.149
COBCODBC3 -0.022 0.153 0.148
ALCWKLY_nrm 0.013 -0.903 -0.042 -0.043
TOTPAC_:COBCODBC2 0.011 -0.382 -0.374 -0.065 0.348
TOTPAC_:COBCODBC3 0.008 -0.270 -0.051 -0.370 0.246 0.115
COBCODBC2:A -0.004 0.329 0.038 0.019 -0.371 -0.883
COBCODBC3:A -0.002 0.233 0.016 0.083 -0.266 -0.101
TOTPAC_:COBCODBC3 COBCODBC2:
TOTPAC_nrml
COBCODBC2
COBCODBC3
ALCWKLY_nrm
TOTPAC_:COBCODBC2
TOTPAC_:COBCODBC3
COBCODBC2:A -0.100
COBCODBC3:A -0.896 0.109
Code
summary(diastol_model2)Linear mixed model fit by REML ['lmerMod']
Formula: DIASTOL ~ (1 | AGEB) + (1 | SEX) + TOTPAC_normalise * COBCODBC +
ALCWKLY_normalise * COBCODBC
Data: data_clean2
REML criterion at convergence: 152993.5
Scaled residuals:
Min 1Q Median 3Q Max
-3.2625 -0.6510 -0.0539 0.6046 5.1914
Random effects:
Groups Name Variance Std.Dev.
AGEB (Intercept) 15.1074 3.887
SEX (Intercept) 0.1998 0.447
Residual 112.9066 10.626
Number of obs: 20214, groups: AGEB, 16; SEX, 2
Fixed effects:
Estimate Std. Error t value
(Intercept) 75.16128 1.03168 72.853
TOTPAC_normalise -0.35012 0.15289 -2.290
COBCODBC2 -0.08805 0.32561 -0.270
COBCODBC3 1.24535 0.32970 3.777
ALCWKLY_normalise 0.42307 0.07847 5.392
TOTPAC_normalise:COBCODBC2 0.28618 0.36741 0.779
TOTPAC_normalise:COBCODBC3 -0.13640 0.49250 -0.277
COBCODBC2:ALCWKLY_normalise -0.28393 0.19848 -1.431
COBCODBC3:ALCWKLY_normalise -0.17536 0.26314 -0.666
Correlation of Fixed Effects:
(Intr) TOTPAC_n COBCODBC2 COBCODBC3 ALCWKL TOTPAC_:COBCODBC2
TOTPAC_nrml -0.063
COBCODBC2 -0.044 0.149
COBCODBC3 -0.045 0.153 0.148
ALCWKLY_nrm 0.027 -0.903 -0.042 -0.043
TOTPAC_:COBCODBC2 0.023 -0.382 -0.374 -0.065 0.348
TOTPAC_:COBCODBC3 0.015 -0.271 -0.051 -0.370 0.246 0.115
COBCODBC2:A -0.008 0.329 0.038 0.019 -0.371 -0.883
COBCODBC3:A -0.005 0.233 0.016 0.083 -0.266 -0.101
TOTPAC_:COBCODBC3 COBCODBC2:
TOTPAC_nrml
COBCODBC2
COBCODBC3
ALCWKLY_nrm
TOTPAC_:COBCODBC2
TOTPAC_:COBCODBC3
COBCODBC2:A -0.100
COBCODBC3:A -0.896 0.109
Exercise
Code
0 1 2 9
0 14655 21821 0
Code
hist(data_clean3$EXLWMBC)Code
systol_model3 <- lmer(
SYSTOL ~ EXREGUIC * COBCODBC + (1 | AGEB) + (1 | SEX) +
EXREGUID * COBCODBC + EXFSRMNE * COBCODBC + EXLWMBC * COBCODBC + EXLWVBC * COBCODBC,
data = data_clean3
)
diastol_model3 <- lmer(
DIASTOL ~ EXREGUIC * COBCODBC + (1 | AGEB) + (1 | SEX) +
EXREGUID * COBCODBC + EXFSRMNE * COBCODBC + EXLWMBC * COBCODBC + EXLWVBC * COBCODBC,
data = data_clean3
)
# Summaries
summary(systol_model3)Linear mixed model fit by REML ['lmerMod']
Formula: SYSTOL ~ EXREGUIC * COBCODBC + (1 | AGEB) + (1 | SEX) + EXREGUID *
COBCODBC + EXFSRMNE * COBCODBC + EXLWMBC * COBCODBC + EXLWVBC *
COBCODBC
Data: data_clean3
REML criterion at convergence: 315700.4
Scaled residuals:
Min 1Q Median 3Q Max
-3.8484 -0.6688 -0.0926 0.5613 5.8003
Random effects:
Groups Name Variance Std.Dev.
AGEB (Intercept) 120.429 10.974
SEX (Intercept) 5.555 2.357
Residual 334.371 18.286
Number of obs: 36476, groups: AGEB, 15; SEX, 2
Fixed effects:
Estimate Std. Error t value
(Intercept) 1.255e+02 3.299e+00 38.047
EXREGUIC2 -6.408e-01 4.990e-01 -1.284
COBCODBC2 -2.250e+00 6.669e-01 -3.374
COBCODBC3 1.031e+00 6.467e-01 1.594
EXREGUID2 1.728e+00 4.987e-01 3.464
EXFSRMNE 7.816e-04 8.820e-04 0.886
EXLWMBC 2.040e-03 1.072e-03 1.902
EXLWVBC 1.562e-03 1.193e-03 1.310
EXREGUIC2:COBCODBC2 -1.296e+00 1.170e+00 -1.108
EXREGUIC2:COBCODBC3 -1.186e-01 1.147e+00 -0.103
COBCODBC2:EXREGUID2 3.504e+00 1.176e+00 2.979
COBCODBC3:EXREGUID2 2.080e-01 1.155e+00 0.180
COBCODBC2:EXFSRMNE 8.262e-04 2.017e-03 0.410
COBCODBC3:EXFSRMNE 3.070e-03 2.084e-03 1.473
COBCODBC2:EXLWMBC -4.331e-03 2.312e-03 -1.873
COBCODBC3:EXLWMBC -2.654e-03 2.811e-03 -0.944
COBCODBC2:EXLWVBC -5.127e-03 3.172e-03 -1.616
COBCODBC3:EXLWVBC -7.331e-03 3.433e-03 -2.135
Code
summary(diastol_model3)Linear mixed model fit by REML ['lmerMod']
Formula: DIASTOL ~ EXREGUIC * COBCODBC + (1 | AGEB) + (1 | SEX) + EXREGUID *
COBCODBC + EXFSRMNE * COBCODBC + EXLWMBC * COBCODBC + EXLWVBC *
COBCODBC
Data: data_clean3
REML criterion at convergence: 277745.9
Scaled residuals:
Min 1Q Median 3Q Max
-4.6512 -0.6636 -0.0400 0.6210 5.3538
Random effects:
Groups Name Variance Std.Dev.
AGEB (Intercept) 11.8821 3.4470
SEX (Intercept) 0.2909 0.5393
Residual 118.1252 10.8685
Number of obs: 36476, groups: AGEB, 15; SEX, 2
Fixed effects:
Estimate Std. Error t value
(Intercept) 75.7301093 0.9820006 77.118
EXREGUIC2 0.8049048 0.2965629 2.714
COBCODBC2 -0.4961945 0.3963551 -1.252
COBCODBC3 0.8276915 0.3843725 2.153
EXREGUID2 -0.1939415 0.2964021 -0.654
EXFSRMNE -0.0002955 0.0005242 -0.564
EXLWMBC 0.0004067 0.0006373 0.638
EXLWVBC -0.0048991 0.0007088 -6.912
EXREGUIC2:COBCODBC2 -2.7735352 0.6951454 -3.990
EXREGUIC2:COBCODBC3 1.7622351 0.6814629 2.586
COBCODBC2:EXREGUID2 2.6212556 0.6991956 3.749
COBCODBC3:EXREGUID2 -1.2073901 0.6863879 -1.759
COBCODBC2:EXFSRMNE -0.0012441 0.0011986 -1.038
COBCODBC3:EXFSRMNE -0.0008997 0.0012386 -0.726
COBCODBC2:EXLWMBC -0.0013247 0.0013745 -0.964
COBCODBC3:EXLWMBC 0.0022148 0.0016707 1.326
COBCODBC2:EXLWVBC -0.0080101 0.0018851 -4.249
COBCODBC3:EXLWVBC -0.0018109 0.0020406 -0.887
Code
# Define labels for COBCODBC
labels <- c("Australians", "Main English Speaking Countries", "Other")
# Create the plot with relabeled legend
ggplot(data_clean3, aes(x = EXREGUID, y = SYSTOL, color = COBCODBC, group = COBCODBC)) +
stat_summary(fun = "mean", geom = "point", size = 3) + # Points for means
stat_summary(fun = "mean", geom = "line", aes(group = COBCODBC), size = 1) + # Line connecting means
labs(title = "Interaction Plot: Exercise Guidelines and SYSTOL by Country of Birth",
x = "Exercise Guidelines",
y = "SYSTOL",
color = "Country of Birth") + # Change legend title
scale_color_manual(values = c("lightblue3", "tomato3", "palegreen4"), labels = labels) +
theme_bw()Code
labels <- c("Australians", "Main English Speaking Countries", "Other")
ggplot(data_clean3, aes(x = EXLWVBC, y = SYSTOL, color = COBCODBC)) +
geom_point(alpha = 0.2) +
geom_smooth(method = "lm", se = FALSE) + # Add regression lines for each group
labs(title = "Interaction Between Minutes of Vigorous Exercise and SYSTOL by Country of Birth",
x = "Minutes of Vigorous Exercise",
y = "SYSTOL") + facet_wrap(~ COBCODBC, scales = "free_y") +
theme_bw()+ # Change legend title
scale_color_manual(values = c("lightblue4", "tomato3", "palegreen4"), labels = labels) +
theme_bw()Code
ggplot(data_clean3, aes(x = EXREGUID, y = SYSTOL, color = COBCODBC, group = COBCODBC)) +
stat_summary(fun = "mean", geom = "point", size = 3) + # Points for means
stat_summary(fun = "mean", geom = "line", aes(group = COBCODBC), size = 1) + # Line connecting means
labs(title = "Interaction Plot: Exercise Guidelines and SYSTOL by Country of Birth",
x = "Exercise Guidelines",
y = "SYSTOL",
color = "Country of Birth") + # Change legend title
scale_x_discrete(labels = c("1" = "Met Guidelines", "2" = "Did Not Meet Guidelines")) + # Custom x-axis labels for discrete scale
scale_color_manual(values = c("lightblue3", "tomato3", "palegreen4"), labels = labels) +
theme_bw()